In [1]:
#!pip install praw
Collecting praw
  Downloading praw-7.6.1-py3-none-any.whl (188 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188.8/188.8 KB 2.0 MB/s eta 0:00:0000:0100:01
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Requirement already satisfied: websocket-client>=0.54.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from praw) (1.4.2)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Requirement already satisfied: requests<3.0,>=2.6.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from prawcore<3,>=2.1->praw) (2.28.1)
Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (1.26.13)
Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.1.1)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0
In [100]:
import praw
import pandas as pd
 
reddit_read_only = praw.Reddit(client_id="-FkFx07VGHhRLGZW9CNuRw",         # your client id
                               client_secret="LP6ZvPq-t4OmaNm8xtIjkxHLRC7N0A",      # your client secret
                               user_agent="MK scraper")        # your user agent
 
 
subrdit = reddit_read_only.subreddit("AmITheAsshole")
 
# Display the name of the Subreddit
#print("Display Name:", subrdit.display_name)
 
# Display the description of the Subreddit
#print("Description:", subrdit.description)
In [101]:
subreddit = reddit_read_only.subreddit("AmITheAsshole")
 
for post in subreddit.top(limit=5):
    print(post.title)
    print()
AITA for telling my wife the lock on my daughter's door does not get removed til my brother inlaw and his daughters are out of our house?

META: This sub is moving towards a value system that frequently doesn't align with the rest of the world

UPDATE, AITA for despising my mentally handicap sister?

AITA For suing my girlfriend after she had my 1967 impala project taken to the scrapyard?

AITA for bringing my SIL’s wallet to the restaurant when she conveniently always forgets it?

In [211]:
posts = subreddit.top("year", limit = 800)

posts_dict = {'title' : [], 'body': [], 'score': [], 'id': [], 'top_comment_body' : [], 'top_comment_score': [], 'url': []}

i=0
for post in posts:
    # Title of each post
    posts_dict["title"].append(post.title)
     
    # Text inside a post
    posts_dict["body"].append(post.selftext)
     
    # Unique ID of each post
    posts_dict["id"].append(post.id)
     
    # The score of a post
    posts_dict["score"].append(post.score)
    
    # Text inside the top comment of the post
    posts_dict["top_comment_body"].append(post.comments[1].body)
    
    # Score of the top comment of the post
    posts_dict["top_comment_score"].append(post.comments[1].score)
     
    # URL of each post
    posts_dict["url"].append(post.url)
    
    if i%10 == 0:
        print("Done with post number ", i)
    i += 1
    
# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts
Done with post number  0
Done with post number  10
Done with post number  20
Done with post number  30
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Input In [211], in <cell line: 6>()
     17 posts_dict["score"].append(post.score)
     19 # Text inside the top comment of the post
---> 20 posts_dict["top_comment_body"].append(post.comments[1].body)
     22 # Score of the top comment of the post
     23 posts_dict["top_comment_score"].append(post.comments[1].score)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/base.py:34, in RedditBase.__getattr__(self, attribute)
     32 """Return the value of ``attribute``."""
     33 if not attribute.startswith("_") and not self._fetched:
---> 34     self._fetch()
     35     return getattr(self, attribute)
     36 raise AttributeError(
     37     f"{self.__class__.__name__!r} object has no attribute {attribute!r}"
     38 )

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/submission.py:634, in Submission._fetch(self)
    633 def _fetch(self):
--> 634     data = self._fetch_data()
    635     submission_listing, comment_listing = data
    636     comment_listing = Listing(self._reddit, _data=comment_listing["data"])

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/models/reddit/submission.py:631, in Submission._fetch_data(self)
    629 name, fields, params = self._fetch_info()
    630 path = API_PATH[name].format(**fields)
--> 631 return self._reddit.request(method="GET", params=params, path=path)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/util/deprecate_args.py:43, in _deprecate_args.<locals>.wrapper.<locals>.wrapped(*args, **kwargs)
     36     arg_string = _generate_arg_string(_old_args[: len(args)])
     37     warn(
     38         f"Positional arguments for {func.__qualname__!r} will no longer be"
     39         f" supported in PRAW 8.\nCall this function with {arg_string}.",
     40         DeprecationWarning,
     41         stacklevel=2,
     42     )
---> 43 return func(**dict(zip(_old_args, args)), **kwargs)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/praw/reddit.py:941, in Reddit.request(self, data, files, json, method, params, path)
    939     raise ClientException("At most one of 'data' or 'json' is supported.")
    940 try:
--> 941     return self._core.request(
    942         data=data,
    943         files=files,
    944         json=json,
    945         method=method,
    946         params=params,
    947         path=path,
    948     )
    949 except BadRequest as exception:
    950     try:

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:330, in Session.request(self, method, path, data, files, json, params, timeout)
    328     json["api_type"] = "json"
    329 url = urljoin(self._requestor.oauth_url, path)
--> 330 return self._request_with_retries(
    331     data=data,
    332     files=files,
    333     json=json,
    334     method=method,
    335     params=params,
    336     timeout=timeout,
    337     url=url,
    338 )

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:228, in Session._request_with_retries(self, data, files, json, method, params, timeout, url, retry_strategy_state)
    226 retry_strategy_state.sleep()
    227 self._log_request(data, method, params, url)
--> 228 response, saved_exception = self._make_request(
    229     data,
    230     files,
    231     json,
    232     method,
    233     params,
    234     retry_strategy_state,
    235     timeout,
    236     url,
    237 )
    239 do_retry = False
    240 if (
    241     response is not None
    242     and response.status_code == codes["unauthorized"]
    243 ):

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/sessions.py:185, in Session._make_request(self, data, files, json, method, params, retry_strategy_state, timeout, url)
    173 def _make_request(
    174     self,
    175     data,
   (...)
    182     url,
    183 ):
    184     try:
--> 185         response = self._rate_limiter.call(
    186             self._requestor.request,
    187             self._set_header_callback,
    188             method,
    189             url,
    190             allow_redirects=False,
    191             data=data,
    192             files=files,
    193             json=json,
    194             params=params,
    195             timeout=timeout,
    196         )
    197         log.debug(
    198             f"Response: {response.status_code}"
    199             f" ({response.headers.get('content-length')} bytes)"
    200         )
    201         return response, None

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/rate_limit.py:34, in RateLimiter.call(self, request_function, set_header_callback, *args, **kwargs)
     32 self.delay()
     33 kwargs["headers"] = set_header_callback()
---> 34 response = request_function(*args, **kwargs)
     35 self.update(response.headers)
     36 return response

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/prawcore/requestor.py:58, in Requestor.request(self, timeout, *args, **kwargs)
     56 """Issue the HTTP request capturing any errors that may occur."""
     57 try:
---> 58     return self._http.request(
     59         *args, timeout=timeout or self.timeout, **kwargs
     60     )
     61 except Exception as exc:
     62     raise RequestException(exc, args, kwargs)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/sessions.py:587, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    582 send_kwargs = {
    583     "timeout": timeout,
    584     "allow_redirects": allow_redirects,
    585 }
    586 send_kwargs.update(settings)
--> 587 resp = self.send(prep, **send_kwargs)
    589 return resp

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/sessions.py:745, in Session.send(self, request, **kwargs)
    742         pass
    744 if not stream:
--> 745     r.content
    747 return r

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/models.py:899, in Response.content(self)
    897         self._content = None
    898     else:
--> 899         self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b""
    901 self._content_consumed = True
    902 # don't need to release the connection; that's been handled by urllib3
    903 # since we exhausted the data.

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/requests/models.py:816, in Response.iter_content.<locals>.generate()
    814 if hasattr(self.raw, "stream"):
    815     try:
--> 816         yield from self.raw.stream(chunk_size, decode_content=True)
    817     except ProtocolError as e:
    818         raise ChunkedEncodingError(e)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:628, in HTTPResponse.stream(self, amt, decode_content)
    626 else:
    627     while not is_fp_closed(self._fp):
--> 628         data = self.read(amt=amt, decode_content=decode_content)
    630         if data:
    631             yield data

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:567, in HTTPResponse.read(self, amt, decode_content, cache_content)
    564 fp_closed = getattr(self._fp, "closed", False)
    566 with self._error_catcher():
--> 567     data = self._fp_read(amt) if not fp_closed else b""
    568     if amt is None:
    569         flush_decoder = True

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages/urllib3/response.py:533, in HTTPResponse._fp_read(self, amt)
    530     return buffer.getvalue()
    531 else:
    532     # StringIO doesn't like amt=None
--> 533     return self._fp.read(amt) if amt is not None else self._fp.read()

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/http/client.py:459, in HTTPResponse.read(self, amt)
    456 if amt is not None:
    457     # Amount is given, implement using readinto
    458     b = bytearray(amt)
--> 459     n = self.readinto(b)
    460     return memoryview(b)[:n].tobytes()
    461 else:
    462     # Amount is not given (unbounded read) so we must check self.length
    463     # and self.chunked

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/http/client.py:503, in HTTPResponse.readinto(self, b)
    498         b = memoryview(b)[0:self.length]
    500 # we do not use _safe_read() here because this may be a .will_close
    501 # connection, and the user is reading more bytes than will be provided
    502 # (for example, reading in 1k chunks)
--> 503 n = self.fp.readinto(b)
    504 if not n and b:
    505     # Ideally, we would raise IncompleteRead if the content-length
    506     # wasn't satisfied, but it might break compatibility.
    507     self._close_conn()

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b)
    667 while True:
    668     try:
--> 669         return self._sock.recv_into(b)
    670     except timeout:
    671         self._timeout_occurred = True

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/ssl.py:1241, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1237     if flags != 0:
   1238         raise ValueError(
   1239           "non-zero flags not allowed in calls to recv_into() on %s" %
   1240           self.__class__)
-> 1241     return self.read(nbytes, buffer)
   1242 else:
   1243     return super().recv_into(buffer, nbytes, flags)

File /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/ssl.py:1099, in SSLSocket.read(self, len, buffer)
   1097 try:
   1098     if buffer is not None:
-> 1099         return self._sslobj.read(len, buffer)
   1100     else:
   1101         return self._sslobj.read(len)

KeyboardInterrupt: 
In [ ]:
top_posts.to_csv('TM_project/reddit_posts.csv', index=False)
In [ ]:
import pickle # for loading (and saving) the previously web scraped data

import pandas as pd # for processing data in dataframes
import matplotlib.pyplot as plt # for plotting

import re # for cleaning textual data (uses regular expressions ouch!)
from collections import Counter # for counting tokens occurences
import math # for calculations

import nltk
from nltk.tokenize import word_tokenize # for tokenization
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords

# import stop_words # source: https://pypi.org/project/stop-words/#installation
# from stop_words import get_stop_words # alternative stopwords list

import gensim
from gensim import corpora # for: Dictionary(), word2bow()
from gensim import models # for: TfidfModel()

import statistics # for: quantiles()

import numpy as np # for some maths

import time # for measuring time of computation

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
In [ ]:
#removing \n
post_texts = top_posts["body"].map(lambda x: re.sub('\n', ' ', x))

#removing all numbers
post_texts = post_texts.map(lambda x: re.sub(r'[0-9]+', '', x))

#removing ,\!?/:;''()``’“-”—#
post_texts = post_texts.map(lambda x: re.sub("[,\!?/:;''()``’“-”—#]", '', x))

#removing .
post_texts = post_texts.map(lambda x: re.sub(r"([.]+)", '', x))

#all letters to lover case
post_texts = post_texts.map(lambda x: x.lower())

#removing one-letter words
post_texts = post_texts.map(lambda x: re.sub(r'\b\w\b', '', x))

post_texts
In [ ]:
#word_tokenize() applied to every single text
for i in range(0,len(post_texts)):
    post_texts[i] = word_tokenize(post_texts[i])
    
post_texts
In [ ]:
ps = PorterStemmer()

for i in range(0,len(post_texts)):
    
    words = []
    for word in post_texts[i]:
        words.append(ps.stem(word)) #stems every token in document and append it to a list
                                    #it takes few minutes

post_texts
In [ ]:
stop_words = nltk.corpus.stopwords.words('english') #one of stopwords dictionaries available in Python

# cleaning stopwords
stop_words = pd.Series(stop_words).map(lambda x: re.sub('\n', '', x))
stop_words = stop_words.map(lambda x: re.sub("[,\!?/:;''()``]", '', x))
stop_words = stop_words.map(lambda x: re.sub(r"([.]+)", '', x))

# stemming stopwords
ps = PorterStemmer()
for i in range(0,len(stop_words)):
    stop_words[i] = ps.stem(stop_words[i])
In [ ]:
#making stopwords back a list
stop_words = list(stop_words)

#adding some specific stopwords
stop_words.append('``')
stop_words.append("\'\'")
In [ ]:
# removing stopwords from post texts
for i in range(0,len(post_texts)):
    post_texts[i] = [word for word in post_texts[i] if not word in list(stop_words)]
post_texts
In [ ]:
top_posts["body_clean"] = post_texts
top_posts.head()
In [ ]:
def generate_ngrams(text, ngram = 1):
    temp = zip(*[text[i:] for i in range(0,ngram)]) # set with pairs, three, ..., ns of tokens
    ans = [' '.join(ngram) for ngram in temp] # joins the elements in strings
    ans = pd.Series(ans).map(lambda x: re.sub(" ", '_', x)) # replaces spaces with '_'
    return list(ans)
In [ ]:
for i in range(0,len(post_texts)):
    unigrams = post_texts[i]
    bigrams = generate_ngrams(post_texts[i], ngram = 2)
    trigrams = generate_ngrams(post_texts[i], ngram = 3)
    
    text = []
    text.append(unigrams)
    text.append(bigrams)
    text.append(trigrams)
    
    post_texts[i] = [item for sublist in text for item in sublist] 

post_texts
In [ ]:
#dictionary from gensim library = keys are: 1, 2, 3, ..., number of tokens; values are tokens' names
dictionary = corpora.Dictionary(post_texts) 

#corpus from gensim library consists of so called bows
#every bow = keys are tokens' indexes; values are numbers of tokens' occurences in text
corpus = [dictionary.doc2bow(text) for text in post_texts]
In [ ]:
tfidf_model = models.TfidfModel(corpus, id2word = dictionary)
In [ ]:
def TFIDF(dictionary, corpus, which_text, tfidf_model):
    bow = corpus[which_text]
    tfidfdictionary = dict(tfidf_model[bow]) #TFIDF for tokens in a chosen text
    
    #below: keys are tokens' names; values are numbers of tokens' occurences in text
    TFIDFdictionary = dict((dictionary[key], value) for (key, value) in tfidfdictionary.items())
    
    return(TFIDFdictionary)
In [ ]:
TFIDF(dictionary, corpus, 0, tfidf_model)
In [ ]:
d_tfidf = {}

for i in range(0,len(corpus)): # for each text
    data = TFIDF(dictionary, corpus, i, tfidf_model) # calculate TFIDF values for text's tokens

    for token, value in data.items(): # next, for each token and its TFIDF value in text, prepare a dictionary
                                      # with tokens' names as keys and list of TF-IDFs as values
        d_tfidf.setdefault(token, []).append(value)
In [ ]:
tfidf_values = [item for sublist in list(d_tfidf.values()) for item in sublist]

plt.hist(tfidf_values, bins=1000)
plt.xlabel('TF-IDF')
plt.ylabel('Number of tokens with certain TF-IDF value')
plt.xlim([0, 0.1])
plt.show()
In [ ]:
for i in [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5]:
    print('Quantile ',i*100,'%: ',np.quantile(tfidf_values,i),sep='')
In [ ]:
import pickle # for saving objects

import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px # for nice plotting

import warnings

import math

from nltk.tokenize import RegexpTokenizer # for LSA in sklearn, we will need additional tokenizer

from sklearn.feature_extraction.text import CountVectorizer # one can consider LSA with DF in DTM...
from sklearn.feature_extraction.text import TfidfVectorizer # or with TF-IDF values in DTM

from sklearn.decomposition import LatentDirichletAllocation # LDA implementation

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
In [ ]:
# as our preprocessed data is already tokenized
# therefore, we need to make them strings again...

def listToString(s):  
    str1 = ""   
    for ele in s:  
        str1 += ele+" "    
    return str1  

top_posts["body_clean_str"] = top_posts["body_clean"] # new column, for now a copy of tokenized and preprocessed texts
for i in range(0,len(top_posts)):
    top_posts["body_clean_str"][i] = listToString(top_posts["body_clean_str"][i])
    
top_posts.head()
In [ ]:
warnings.filterwarnings("ignore") #ignoring popping up warnings

tokenizer = RegexpTokenizer(r'\w+') # tokenizer

tf_vectorizer = CountVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
                                max_df = 0.75, #filtering with document frequency
                                min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
                                tokenizer = tokenizer.tokenize
)
tf = tf_vectorizer.fit_transform(top_posts["body_clean_str"])
tf_feature_names = tf_vectorizer.get_feature_names()

tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
                                   max_df = 0.75, #filtering with document frequency
                                   min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
                                   tokenizer = tokenizer.tokenize
)
tfidf = tfidf_vectorizer.fit_transform(top_posts["body_clean_str"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
In [ ]:
def get_umass_score(dt_matrix, i, j):
    zo_matrix = (dt_matrix > 0).astype(int)
    col_i, col_j = zo_matrix[:, i], zo_matrix[:, j]
    col_ij = col_i + col_j
    col_ij = (col_ij == 2).astype(int)    
    Di, Dij = col_i.sum(), col_ij.sum()    
    return math.log((Dij + 1) / Di)

def get_topic_coherence(dt_matrix, topic, n_top_words):
    indexed_topic = zip(topic, range(0, len(topic)))
    topic_top = sorted(indexed_topic, key=lambda x: 1 - x[0])[0:n_top_words]
    coherence = 0
    for j_index in range(0, len(topic_top)):
        for i_index in range(0, j_index - 1):
            i = topic_top[i_index][1]
            j = topic_top[j_index][1]
            coherence += get_umass_score(dt_matrix, i, j)
    return coherence

def get_average_topic_coherence(dt_matrix, topics, n_top_words):
    total_coherence = 0
    for i in range(0, len(topics)):
        total_coherence += get_topic_coherence(dt_matrix, topics[i], n_top_words)
    return total_coherence / len(topics)
In [ ]:
measures_specific = []

for n_topics in range(2,51,1):
    
    print('Trying parameters:', n_topics)
    
    lda = LatentDirichletAllocation(n_components = n_topics, 
                                    learning_method = 'online',
                                    learning_offset = 50.0,
                                    max_iter = 5, 
                                    random_state = 42)
                
    lda.fit(tf)
    
    avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)

    measures_specific.append([avg_coherence, n_topics])
In [ ]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_df_lda = pd.DataFrame(measures_specific).rename(columns={
    0: 'avg_coherence', 1: 'n_topics'
})

save_object(measures_specific_df_lda, 'TM_project/measures_specific_df_lda.pkl')
In [ ]:
with open("TM_project/measures_specific_df_lda.pkl", "rb") as fp:
    measures_specific_df_lda = pickle.load(fp)
In [ ]:
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_df_lda['n_topics'],measures_specific_df_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
In [ ]:
measures_specific_df_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
In [ ]:
warnings.filterwarnings("ignore") #ignoring popping up warnings

measures_specific = []

for n_topics in range(2,51,1):
    
    print('Trying parameters:', n_topics)
    
    lda = LatentDirichletAllocation(n_components = n_topics, 
                                    learning_method = 'online',
                                    learning_offset = 50.0,
                                    max_iter = 5, 
                                    random_state = 42)
                
    lda.fit(tfidf)
    
    avg_coherence = get_average_topic_coherence(tfidf, lda.components_, 25)

    measures_specific.append([avg_coherence, n_topics])
In [ ]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_tfidf_lda = pd.DataFrame(measures_specific).rename(columns={
    0: 'avg_coherence', 1: 'n_topics'
})

save_object(measures_specific_tfidf_lda, 'TM_project/measures_specific_tfidf_lda.pkl')
In [ ]:
with open("TM_project/measures_specific_tfidf_lda.pkl", "rb") as fp:
    measures_specific_tfidf_lda = pickle.load(fp)
In [ ]:
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_tfidf_lda['n_topics'],measures_specific_tfidf_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
In [ ]:
measures_specific_tfidf_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
In [132]:
# here we consider the previously presented LatentDirichletAllocation() function, still with less parameters
lda = LatentDirichletAllocation(n_components = 10, 
                                learning_method = 'online', 
                                learning_offset = 80.0,
                                max_iter = 5, 
                                random_state = 42)
lda.fit(tfidf)
Out[132]:
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
                          max_iter=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
                          max_iter=5, random_state=42)
In [ ]:
for index, component in enumerate(lda.components_): #taking model's components 
                                                    #(values from reconstructed Document-Term Matrix)
    zipped = zip(tf_feature_names, component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
    
    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0:  ['his', 'his mom', 'college', 'school', 'princess', 'asked why said', 'family', 'girls', 'mom', 'going']
Topic 1:  ['listen', 'anymore', 'hes', 'big', 'his', 'asking was', 'begging', 'wedding', 'special', 'celebratory dinner']
Topic 2:  ['adam', 'work', 'shouting', 'think its', 'this morning', 'dog', 'very much', 'drink', 'fair', 'would like']
Topic 3:  ['his', 'family', 'house', 'hold', 'started', 'like', 'get', 'husband', 'mom', 'friend']
Topic 4:  ['approached', 'full time', 'year old daughter', 'welcome', 'dead', 'couple days', 'cancelled', 'comment', 'bother', 'son his']
Topic 5:  ['his', 'husband', 'im', 'mom', 'sister', 'family', 'like', 'wife', 'get', 'parents']
Topic 6:  ['his', 'brother', 'blue', 'years', 'new', 'im', 'dead', 'job', 'husband', 'fund']
Topic 7:  ['im', 'sister', 'carrying', 'parents', 'went', 'ive', 'was serious', 'income', 'saturday', 'still']
Topic 8:  ['son', 'husband', 'tradition', 'family dinner', 'said was', 'dress', 'always', 'his', 'give', 'heart']
Topic 9:  ['want', 'husband', 'daughter', 'made', 'its', 'gone', 'cake', 'wife', 'thats', 'any']
In [82]:
params = []
for alpha in [0.0001, 0.001, 0.01, 0.05, 0.1]:
    for beta in [0.0001, 0.001, 0.01, 0.05, 0.1]:
        for vectorizer_name in ['tf','tf-idf']:
            
            if(vectorizer_name == 'tf'):
                print(alpha, beta, 'tf')
                lda = LatentDirichletAllocation(n_components = 10, 
                                                doc_topic_prior = alpha,
                                                topic_word_prior = beta,
                                                learning_method = 'online', 
                                                learning_offset = 50.0,
                                                max_iter = 5, 
                                                random_state = 42)
                lda.fit(tf)
                avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
                params.append([alpha, beta, 'tf', avg_coherence])
                    
            if(vectorizer_name == 'tf-idf'):
                print(alpha, beta, 'tf-idf')
                lda = LatentDirichletAllocation(n_components = 10, 
                                                doc_topic_prior = alpha,
                                                topic_word_prior = beta,
                                                learning_method = 'online', 
                                                learning_offset = 50.0,
                                                max_iter = 5, 
                                                random_state = 42)
                lda.fit(tfidf)
                avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
                params.append([alpha, beta, 'tf-idf', avg_coherence])
0.0001 0.0001 tf
0.0001 0.0001 tf-idf
0.0001 0.001 tf
0.0001 0.001 tf-idf
0.0001 0.01 tf
0.0001 0.01 tf-idf
0.0001 0.05 tf
0.0001 0.05 tf-idf
0.0001 0.1 tf
0.0001 0.1 tf-idf
0.001 0.0001 tf
0.001 0.0001 tf-idf
0.001 0.001 tf
0.001 0.001 tf-idf
0.001 0.01 tf
0.001 0.01 tf-idf
0.001 0.05 tf
0.001 0.05 tf-idf
0.001 0.1 tf
0.001 0.1 tf-idf
0.01 0.0001 tf
0.01 0.0001 tf-idf
0.01 0.001 tf
0.01 0.001 tf-idf
0.01 0.01 tf
0.01 0.01 tf-idf
0.01 0.05 tf
0.01 0.05 tf-idf
0.01 0.1 tf
0.01 0.1 tf-idf
0.05 0.0001 tf
0.05 0.0001 tf-idf
0.05 0.001 tf
0.05 0.001 tf-idf
0.05 0.01 tf
0.05 0.01 tf-idf
0.05 0.05 tf
0.05 0.05 tf-idf
0.05 0.1 tf
0.05 0.1 tf-idf
0.1 0.0001 tf
0.1 0.0001 tf-idf
0.1 0.001 tf
0.1 0.001 tf-idf
0.1 0.01 tf
0.1 0.01 tf-idf
0.1 0.05 tf
0.1 0.05 tf-idf
0.1 0.1 tf
0.1 0.1 tf-idf
In [83]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
params_df = pd.DataFrame(params).rename(columns={
    0: 'alpha', 1: 'beta', 2: 'vectorizer', 3: 'avg_coherence'
})

save_object(params_df, 'TM_project/params_df.pkl')
In [133]:
with open("TM_project/params_df.pkl", "rb") as fp:
    params_df = pickle.load(fp)
In [134]:
params_df.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
Out[134]:
alpha beta vectorizer avg_coherence
4 0.0001 0.0100 tf -240.468495
14 0.0010 0.0100 tf -240.489874
0 0.0001 0.0001 tf -240.715091
2 0.0001 0.0010 tf -240.715091
10 0.0010 0.0001 tf -240.778999
12 0.0010 0.0010 tf -240.778999
6 0.0001 0.0500 tf -240.823886
16 0.0010 0.0500 tf -240.887794
8 0.0001 0.1000 tf -241.310168
In [135]:
fig = px.scatter(params_df[params_df['vectorizer']=='tf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
In [136]:
fig = px.scatter(params_df[params_df['vectorizer']=='tf-idf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
In [144]:
lda = LatentDirichletAllocation(n_components = 10, # let us stay with 30, as that is what topic coherence initially recommended 
                                doc_topic_prior = 0.0001,
                                topic_word_prior = 0.0100,
                                learning_method = 'online', 
                                learning_offset = 10.0,
                                max_iter = 20, 
                                random_state = 42)
lda.fit(tf) # TF for now

topics_lists = []

for index, component in enumerate(lda.components_): #taking model's components 
                                                    #(values from reconstructed Document-Term Matrix)
    zipped = zip(tf_feature_names, component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
    
    topics_lists.append(top_terms_list)
    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0:  ['teacher', 'class', 'teaching', 'teach', 'bedroom', 'girls', 'email', 'asking', 'boundaries', 'five']
Topic 1:  ['gf', 'seat', 'uncomfortable', 'next', 'feel uncomfortable', 'nasty', 'plane', 'flight', 'empty', 'quietly']
Topic 2:  ['work', 'hr', 'coworkers', 'email', 'saying', 'meeting', 'like', 'sent', 'office', 'inappropriate']
Topic 3:  ['results', 'gifts', 'gift', 'relatives', 'appointment', 'first', 'familys', 'expecting', 'babys', 'well']
Topic 4:  ['mom', 'his', 'wedding', 'dress', 'im', 'us', 'his mom', 'made', 'one', 'get']
Topic 5:  ['his', 'im', 'like', 'family', 'would', 'get', 'time', 'has', 'its', 'one']
Topic 6:  ['dress', 'wedding', 'color', 'wearing', 'asked', 'white', 'wear', 'would', 'dress was', 'bride']
Topic 7:  ['im', 'flight', 'his friends', 'maybe', 'guy', 'tell', 'trying', 'plane', 'talk', 'sleep']
Topic 8:  ['his', 'husband', 'mom', 'im', 'home', 'went', 'like', 'his mom', 'dinner', 'get']
Topic 9:  ['his', 'cake', 'husband', 'its', 'im', 'daughter', 'like', 'go', 'being', 'kids']
In [139]:
import os
import openai
from IPython.display import Image
from IPython import display
from base64 import b64decode
In [140]:
openai.api_key = ""
In [175]:
images = []

for i in range(len(topics_lists)):
    
    try:
        topic_prompt = " ".join(topics_lists[i])

        response = openai.Image.create(
            prompt=topic_prompt,
            n=1,
            size="512x512",
            response_format="b64_json"
        )

        images.append((i,response['data'][0]['b64_json']))

        print(i)
    except:
        images.append((i, np.nan))
        print(i)
        print("too NSFW for OpenAI")
0
too NSFW for OpenAI
1
2
3
4
5
6
7
8
9
In [176]:
image_df = pd.DataFrame(images, columns =['topic', 'image'])
image_df["words"] = topics_lists

image_df.head()
Out[176]:
topic image words
0 0 NaN [teacher, class, teaching, teach, bedroom, gir...
1 1 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [gf, seat, uncomfortable, next, feel uncomfort...
2 2 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [work, hr, coworkers, email, saying, meeting, ...
3 3 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [results, gifts, gift, relatives, appointment,...
4 4 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [mom, his, wedding, dress, im, us, his mom, ma...
In [177]:
len(image_df)
Out[177]:
10
In [178]:
for i in range(len(image_df)):
    print("Image for topic ", i, " with words:")
    print(image_df.loc[i,"words"])
    try:
        display.display(display.Image(b64decode(image_df.loc[i,"image"])))
    except:
        print("Too NSFW for OpenAI")
Image for topic  0  with words:
['teacher', 'class', 'teaching', 'teach', 'bedroom', 'girls', 'email', 'asking', 'boundaries', 'five']
Too NSFW for OpenAI
Image for topic  1  with words:
['gf', 'seat', 'uncomfortable', 'next', 'feel uncomfortable', 'nasty', 'plane', 'flight', 'empty', 'quietly']
Image for topic  2  with words:
['work', 'hr', 'coworkers', 'email', 'saying', 'meeting', 'like', 'sent', 'office', 'inappropriate']
Image for topic  3  with words:
['results', 'gifts', 'gift', 'relatives', 'appointment', 'first', 'familys', 'expecting', 'babys', 'well']
Image for topic  4  with words:
['mom', 'his', 'wedding', 'dress', 'im', 'us', 'his mom', 'made', 'one', 'get']
Image for topic  5  with words:
['his', 'im', 'like', 'family', 'would', 'get', 'time', 'has', 'its', 'one']
Image for topic  6  with words:
['dress', 'wedding', 'color', 'wearing', 'asked', 'white', 'wear', 'would', 'dress was', 'bride']
Image for topic  7  with words:
['im', 'flight', 'his friends', 'maybe', 'guy', 'tell', 'trying', 'plane', 'talk', 'sleep']
Image for topic  8  with words:
['his', 'husband', 'mom', 'im', 'home', 'went', 'like', 'his mom', 'dinner', 'get']
Image for topic  9  with words:
['his', 'cake', 'husband', 'its', 'im', 'daughter', 'like', 'go', 'being', 'kids']
In [199]:
df_topics_for_posts = pd.DataFrame(lda.transform(tf).tolist())

df_topics_for_posts.head()
Out[199]:
0 1 2 3 4 5 6 7 8 9
0 6.849268e-07 6.849268e-07 6.849268e-07 6.849268e-07 6.849268e-07 7.937264e-01 6.849268e-07 6.849268e-07 2.062681e-01 6.849268e-07
1 4.347807e-07 4.347807e-07 4.347807e-07 4.347807e-07 4.347807e-07 9.999961e-01 4.347807e-07 4.347807e-07 4.347807e-07 4.347807e-07
2 4.629608e-07 4.629608e-07 4.629608e-07 4.629608e-07 4.629608e-07 8.314305e-01 3.879889e-02 4.629608e-07 1.297673e-01 4.629608e-07
3 3.787864e-07 8.905409e-03 3.787864e-07 3.787864e-07 3.787864e-07 7.637727e-01 3.787864e-07 3.787864e-07 2.273193e-01 3.787864e-07
4 3.831403e-07 3.831403e-07 3.831403e-07 3.831403e-07 3.831403e-07 3.831403e-07 3.831403e-07 3.831403e-07 9.999966e-01 3.831403e-07
In [208]:
top_posts_final = pd.merge(top_posts, round(df_topics_for_posts*100, 3), left_index=True, right_index=True)

top_posts_final.head()
Out[208]:
title body score id top_comment_body top_comment_score url body_clean body_clean_str 0 1 2 3 4 5 6 7 8 9
0 AITA for bringing my SIL’s wallet to the resta... Edit: update on profile\n\nMy (f28) SIL “Amy” ... 68476 x2k5kv NTA. Stone cold busted. Next time she books an... 1443 https://www.reddit.com/r/AmItheAsshole/comment... [edit, update, profile, sil, amy, always, come... edit update profile sil amy always comes visit... 0.0 0.000 0.0 0.0 0.0 79.373 0.00 0.0 20.627 0.0
1 AITA for bringing up my brother's "premature" ... I am a nurse practitioner and I am the primary... 56113 zvmflw You can tell the family about the time you wer... 673 https://www.reddit.com/r/AmItheAsshole/comment... [nurse, practitioner, primary, care, provider,... nurse practitioner primary care provider lot l... 0.0 0.000 0.0 0.0 0.0 100.000 0.00 0.0 0.000 0.0
2 AITA for not taking down my video that was a g... I have a sister that’s 6 years older than me. ... 54700 wyjbjs NTA\n\nMy parents missed my wedding too all be... 1563 https://www.reddit.com/r/AmItheAsshole/comment... [sister, thats, years, older, parents, years, ... sister thats years older parents years cancel ... 0.0 0.000 0.0 0.0 0.0 83.143 3.88 0.0 12.977 0.0
3 UPDATE AITA for walking out of the Airport whe... Hello!.\n\n\nI don't know where to begin...it'... 51466 ur2l3s I'm sorry you are going through this, but I'm ... 18673 https://www.reddit.com/r/AmItheAsshole/comment... [hello, know, beginits, absolute, nightmare, r... hello know beginits absolute nightmare recentl... 0.0 0.891 0.0 0.0 0.0 76.377 0.00 0.0 22.732 0.0
4 AITA for walking out of the Airport when I saw... \n\nI F30 don't have the best relationship wit... 50032 unhse2 Definitely NTA. You know that if you had sucke... 9414 https://www.reddit.com/r/AmItheAsshole/comment... [best, relationship, husbands, mom, since, day... best relationship husbands mom since day one t... 0.0 0.000 0.0 0.0 0.0 0.000 0.00 0.0 100.000 0.0
In [210]:
save_object(top_posts_final, 'TM_project/final_df.pkl')
In [213]:
os.system('jupyter nbconvert --to html Code_for_LDA.ipynb')
This application is used to convert notebook files (*.ipynb)
        to various other formats.

        WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.

Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
    Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
    Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
    read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
    Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
    Write notebook output to stdout instead of files.
    Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
    Run nbconvert in place, overwriting the existing notebook (only
            relevant when converting to notebook format)
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
    Clear output of current file and save in place,
            overwriting the existing notebook.
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
    Exclude input and output prompts from converted document.
    Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
    Exclude input cells and output prompts from converted document.
            This mode is ideal for generating code-free reports.
    Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
    Whether to allow downloading chromium if no suitable version is found on the system.
    Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
    Disable chromium security sandbox when converting to PDF..
    Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
    Shows code input. This flag is only useful for dejavu users.
    Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
    Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
    Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
    Whether the HTML in Markdown cells and cell outputs should be sanitized..
    Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
    Set the log level by value or name.
    Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
    Default: 30
    Equivalent to: [--Application.log_level]
--config=<Unicode>
    Full path of a config file.
    Default: ''
    Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
    The export format to be used, either one of the built-in formats
            ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf']
            or a dotted object name that represents the import path for an
            ``Exporter`` class
    Default: ''
    Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
    Name of the template to use
    Default: ''
    Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
    Name of the template file to use
    Default: None
    Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
    Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
    as prebuilt extension for the lab template)
    Default: 'light'
    Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
    Whether the HTML in Markdown cells and cell outputs should be sanitized.This
    should be set to True by nbviewer or similar tools.
    Default: False
    Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
    Writer class used to write the
                                        results of the conversion
    Default: 'FilesWriter'
    Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
    PostProcessor class used to write the
                                        results of the conversion
    Default: ''
    Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
    overwrite base name use for output files.
                can only be used when converting one notebook at a time.
    Default: ''
    Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
    Directory to write output(s) to. Defaults
                                  to output to the directory of each notebook. To recover
                                  previous default behaviour (outputting to the current
                                  working directory) use . as the flag value.
    Default: ''
    Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
    The URL prefix for reveal.js (version 3.x).
            This defaults to the reveal CDN, but can be any url pointing to a copy
            of reveal.js.
            For speaker notes to work, this must be a relative path to a local
            copy of reveal.js: e.g., "reveal.js".
            If a relative path is given, it must be a subdirectory of the
            current directory (from which the server is run).
            See the usage documentation
            (https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
            for more details.
    Default: ''
    Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
    The nbformat version to write.
            Use this to downgrade notebooks.
    Choices: any of [1, 2, 3, 4]
    Default: 4
    Equivalent to: [--NotebookExporter.nbformat_version]

Examples
--------

    The simplest way to use nbconvert is

            > jupyter nbconvert mynotebook.ipynb --to html

            Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'qtpdf', 'qtpng', 'rst', 'script', 'slides', 'webpdf'].

            > jupyter nbconvert --to latex mynotebook.ipynb

            Both HTML and LaTeX support multiple output templates. LaTeX includes
            'base', 'article' and 'report'.  HTML includes 'basic', 'lab' and
            'classic'. You can specify the flavor of the format used.

            > jupyter nbconvert --to html --template lab mynotebook.ipynb

            You can also pipe the output to stdout, rather than a file

            > jupyter nbconvert mynotebook.ipynb --stdout

            PDF is generated via latex

            > jupyter nbconvert mynotebook.ipynb --to pdf

            You can get (and serve) a Reveal.js-powered slideshow

            > jupyter nbconvert myslides.ipynb --to slides --post serve

            Multiple notebooks can be given at the command line in a couple of
            different ways:

            > jupyter nbconvert notebook*.ipynb
            > jupyter nbconvert notebook1.ipynb notebook2.ipynb

            or you can specify the notebooks list in a config file, containing::

                c.NbConvertApp.notebooks = ["my_notebook.ipynb"]

            > jupyter nbconvert --config mycfg.py

To see all available configurables, use `--help-all`.

[NbConvertApp] WARNING | pattern 'Code_for_LDA.ipynb' matched no files
Out[213]:
65280